import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import chart_studio.plotly as py
import plotly.offline as pyoff
import plotly.graph_objs as go
import warnings
warnings.filterwarnings('ignore')
import warnings
def function_that_warns():
warnings.warn("deprecated", DeprecationWarning)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
function_that_warns() # this will not show a warning
path = '~/Desktop/MSCA 31015 DS for consulting/Assignments/Final assignment/'
df = pd.read_excel(path+'Final Assignment - Week 9.xlsx')
df.head(10)
df.shape
selected_col = ['r','f','m']
df[selected_col].describe()
df.r.hist(color = 'steelblue', bins = 20)
plt.title('Recency')
df.f.hist(color = 'steelblue', bins = 20)
plt.title('Frequency')
df.m.hist(color = 'steelblue', bins = 20)
plt.title('Revenue')
df2 = df[['id','r','f','m','tof']]
df2.head(10)
# get all unique vistor ids
all_customers = df2.id.unique()
all_customers.size
# no repeated customers in the dataset
#get all the customers who bought something
customer_purchased = df2[df2.m != 0].id.sort_values().unique()
customer_purchased.size
customer_browsed = [x for x in all_customers if x not in customer_purchased]
len(customer_browsed)
#There are 891 customers did not buy in the dataset.
from sklearn.cluster import KMeans
sse={}
tx_recency = df2[['r']]
for k in range(1, 10):
kmeans = KMeans(n_clusters=k, max_iter=1000).fit(tx_recency)
tx_recency["clusters"] = kmeans.labels_
sse[k] = kmeans.inertia_
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.show()
# choose k = 4
#build 4 clusters for recency and add it to dataframe
kmeans = KMeans(n_clusters=4)
kmeans.fit(df2[['r']])
df2['RecencyCluster'] = kmeans.predict(df2[['r']])
#function for ordering cluster numbers
def order_cluster(cluster_field_name, target_field_name,df,ascending):
new_cluster_field_name = 'new_' + cluster_field_name
df_new = df.groupby(cluster_field_name)[target_field_name].mean().reset_index()
df_new = df_new.sort_values(by=target_field_name,ascending=ascending).reset_index(drop=True)
df_new['index'] = df_new.index
df_final = pd.merge(df,df_new[[cluster_field_name,'index']], on=cluster_field_name)
df_final = df_final.drop([cluster_field_name],axis=1)
df_final = df_final.rename(columns={"index":cluster_field_name})
return df_final
df2 = order_cluster('RecencyCluster', 'r',df2,False)
df2.groupby('RecencyCluster')['r'].describe()
0 are the most inactive customers and 3 are the most recent customers
# apply k-means based on frequency
kmeans = KMeans(n_clusters=4)
kmeans.fit(df2[['f']])
df2['FrequencyCluster'] = kmeans.predict(df2[['f']])
#order the frequency cluster
df2 = order_cluster('FrequencyCluster', 'f',df2,True)
#see details of each cluster
df2.groupby('FrequencyCluster')['f'].describe()
high frequency number indicates better customers
#apply clustering
kmeans = KMeans(n_clusters=4)
kmeans.fit(df2[['m']])
df2['RevenueCluster'] = kmeans.predict(df2[['m']])
#order the cluster numbers
df2 = order_cluster('RevenueCluster', 'm',df2,True)
#show details of the dataframe
df2.groupby('RevenueCluster')['m'].describe()
#calculate overall score and use mean() to see details
df2['OverallScore'] = df2['RecencyCluster'] + df2['FrequencyCluster'] + df2['RevenueCluster']
df2.groupby('OverallScore')['r','f','m'].mean()
The scoring above clearly shows us that customers with score 7 is our best customers whereas 0 is the worst.
df2['Segment'] = 'Low-Value'
df2.loc[df2['OverallScore']>2,'Segment'] = 'Mid-Value'
df2.loc[df2['OverallScore']>4,'Segment'] = 'High-Value'
#Revenue vs Frequency
graph = df2.query("m < 50000 and f < 2000")
plot_data = [
go.Scatter(
x=graph.query("Segment == 'Low-Value'")['f'],
y=graph.query("Segment == 'Low-Value'")['m'],
mode='markers',
name='Low',
marker= dict(size= 7,
line= dict(width=1),
color= 'blue',
opacity= 0.8
)
),
go.Scatter(
x=graph.query("Segment == 'Mid-Value'")['f'],
y=graph.query("Segment == 'Mid-Value'")['m'],
mode='markers',
name='Mid',
marker= dict(size= 9,
line= dict(width=1),
color= 'green',
opacity= 0.5
)
),
go.Scatter(
x=graph.query("Segment == 'High-Value'")['f'],
y=graph.query("Segment == 'High-Value'")['m'],
mode='markers',
name='High',
marker= dict(size= 11,
line= dict(width=1),
color= 'red',
opacity= 0.9
)
),
]
plot_layout = go.Layout(
yaxis= {'title': "Revenue"},
xaxis= {'title': "Frequency"},
title='Segments'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
#Revenue vs Recency
graph = df2.query("m < 50000 and f < 2000")
plot_data = [
go.Scatter(
x=graph.query("Segment == 'Low-Value'")['f'],
y=graph.query("Segment == 'Low-Value'")['m'],
mode='markers',
name='Low',
marker= dict(size= 7,
line= dict(width=1),
color= 'blue',
opacity= 0.8
)
),
go.Scatter(
x=graph.query("Segment == 'Mid-Value'")['r'],
y=graph.query("Segment == 'Mid-Value'")['m'],
mode='markers',
name='Mid',
marker= dict(size= 9,
line= dict(width=1),
color= 'green',
opacity= 0.5
)
),
go.Scatter(
x=graph.query("Segment == 'High-Value'")['r'],
y=graph.query("Segment == 'High-Value'")['m'],
mode='markers',
name='High',
marker= dict(size= 11,
line= dict(width=1),
color= 'red',
opacity= 0.9
)
),
]
plot_layout = go.Layout(
yaxis= {'title': "Revenue"},
xaxis= {'title': "Recency"},
title='Segments'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
#Frequency vs Recency
graph = df2.query("r < 50000 and f < 2000")
plot_data = [
go.Scatter(
x=graph.query("Segment == 'Low-Value'")['r'],
y=graph.query("Segment == 'Low-Value'")['f'],
mode='markers',
name='Low',
marker= dict(size= 7,
line= dict(width=1),
color= 'blue',
opacity= 0.8
)
),
go.Scatter(
x=graph.query("Segment == 'Mid-Value'")['r'],
y=graph.query("Segment == 'Mid-Value'")['f'],
mode='markers',
name='Mid',
marker= dict(size= 9,
line= dict(width=1),
color= 'green',
opacity= 0.5
)
),
go.Scatter(
x=graph.query("Segment == 'High-Value'")['r'],
y=graph.query("Segment == 'High-Value'")['f'],
mode='markers',
name='High',
marker= dict(size= 11,
line= dict(width=1),
color= 'red',
opacity= 0.9
)
),
]
plot_layout = go.Layout(
yaxis= {'title': "Frequency"},
xaxis= {'title': "Recency"},
title='Segments'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
From the graph we can know the distribution of low, mid and high value customers via different dimensions. We can conclude that we need to improve retention on high-value customers, improve retention and increase frequency on mid-value customers and increase frequency on low-value customers.